library(rtracklayer)
library(tidyverse)
library(ggforce)
library(GenomicRanges)
library(reshape2)
library(RColorBrewer)
# Load data
ATAC_granges <- import("../Data/ATAC_idr.optimal_peak.narrowPeak.gz", format="gz")
CAGE_granges <- import("../Data/ctss_files/hg38.CAGE_AF7N_Pool_81_A2.filt.ctss.bed.gz", format="gz")
genome(ATAC_granges) <- "hg38"
genome(CAGE_granges) <- "hg38"
ATAC_granges
## GRanges object with 258265 ranges and 6 metadata columns:
##            seqnames              ranges strand |        name     score
##               <Rle>           <IRanges>  <Rle> | <character> <numeric>
##        [1]    chr15   89087912-89088828      * |        <NA>      1000
##        [2]     chr1 206202745-206204098      * |        <NA>      1000
##        [3]    chr11     6234200-6235551      * |        <NA>      1000
##        [4]    chr11     3797056-3798236      * |        <NA>      1000
##        [5]     chr8   25458212-25459500      * |        <NA>      1000
##        ...      ...                 ...    ... .         ...       ...
##   [258261]    chr10 106089928-106090175      * |        <NA>       200
##   [258262]    chr10 104639508-104639928      * |        <NA>       200
##   [258263]    chr10 104638719-104639064      * |        <NA>       200
##   [258264]    chr10 103131426-103131736      * |        <NA>       200
##   [258265]    chr10 101874851-101875892      * |        <NA>       357
##            signalValue    pValue    qValue      peak
##              <numeric> <numeric> <numeric> <integer>
##        [1]     22.9179   3542.55   3536.38       348
##        [2]     15.9319   3417.05   3410.94       464
##        [3]     17.5065   3325.87   3319.80       718
##        [4]     20.1593   3283.13   3277.08       639
##        [5]     18.0436   3234.07   3228.04       389
##        ...         ...       ...       ...       ...
##   [258261]     2.09155   3.82812   2.18792        93
##   [258262]     2.09155   3.82812   2.18792       390
##   [258263]     2.09155   3.82812   2.18792       237
##   [258264]     2.09155   3.82812   2.18792       207
##   [258265]     2.09155   3.82812   2.18792       177
##   -------
##   seqinfo: 24 sequences from hg38 genome; no seqlengths
CAGE_granges
## GRanges object with 1486902 ranges and 2 metadata columns:
##             seqnames    ranges strand |                     name     score
##                <Rle> <IRanges>  <Rle> |              <character> <numeric>
##         [1]     chr1     17496      - |       chr1:17495-17496,-         1
##         [2]     chr1    184489      - |     chr1:184488-184489,-         1
##         [3]     chr1    611298      + |     chr1:611297-611298,+         1
##         [4]     chr1    629080      + |     chr1:629079-629080,+         1
##         [5]     chr1    629081      + |     chr1:629080-629081,+         1
##         ...      ...       ...    ... .                      ...       ...
##   [1486898]     chrY  56844175      + | chrY:56844174-56844175,+         1
##   [1486899]     chrY  56844226      - | chrY:56844225-56844226,-         2
##   [1486900]     chrY  56849211      + | chrY:56849210-56849211,+         1
##   [1486901]     chrY  56853769      - | chrY:56853768-56853769,-         2
##   [1486902]     chrY  56869591      - | chrY:56869590-56869591,-         1
##   -------
##   seqinfo: 24 sequences from hg38 genome; no seqlengths

Extract the positive set

# Extend ATAC peaks 250 bp in both directions
ATAC_granges_peaks <- GRanges(seqnames = seqnames(ATAC_granges),
                              ranges = IRanges(start = start(ATAC_granges) + ATAC_granges$peak, width = 1),
                              strand = strand(ATAC_granges))
genome(ATAC_granges_peaks) <- "hg38"

start(ATAC_granges_peaks) <- start(ATAC_granges_peaks) - 250
end(ATAC_granges_peaks) <- end(ATAC_granges_peaks) + 250
ranges(ATAC_granges_peaks)
## IRanges object with 258265 ranges and 0 metadata columns:
##                start       end     width
##            <integer> <integer> <integer>
##        [1]  89088010  89088510       501
##        [2] 206202959 206203459       501
##        [3]   6234668   6235168       501
##        [4]   3797445   3797945       501
##        [5]  25458351  25458851       501
##        ...       ...       ...       ...
##   [258261] 106089771 106090271       501
##   [258262] 104639648 104640148       501
##   [258263] 104638706 104639206       501
##   [258264] 103131383 103131883       501
##   [258265] 101874778 101875278       501
ATAC_granges_peaks
## GRanges object with 258265 ranges and 0 metadata columns:
##            seqnames              ranges strand
##               <Rle>           <IRanges>  <Rle>
##        [1]    chr15   89088010-89088510      *
##        [2]     chr1 206202959-206203459      *
##        [3]    chr11     6234668-6235168      *
##        [4]    chr11     3797445-3797945      *
##        [5]     chr8   25458351-25458851      *
##        ...      ...                 ...    ...
##   [258261]    chr10 106089771-106090271      *
##   [258262]    chr10 104639648-104640148      *
##   [258263]    chr10 104638706-104639206      *
##   [258264]    chr10 103131383-103131883      *
##   [258265]    chr10 101874778-101875278      *
##   -------
##   seqinfo: 24 sequences from hg38 genome; no seqlengths
# Remove overlapping regions
ATAC_peaks_filtered <- rownames_to_column(as.data.frame(ATAC_granges_peaks)) %>% 
  as_tibble() %>% mutate(overlaps = countOverlaps(ATAC_granges_peaks)) %>% 
  filter(overlaps==1)
ATAC_peaks_filtered
ATAC_granges_peaks_filtered <- ATAC_granges_peaks[as.numeric(ATAC_peaks_filtered$rowname)]
ATAC_granges_peaks_filtered
## GRanges object with 81919 ranges and 0 metadata columns:
##           seqnames              ranges strand
##              <Rle>           <IRanges>  <Rle>
##       [1]    chr12   55973648-55974148      *
##       [2]     chr3   50235720-50236220      *
##       [3]    chr17   15699328-15699828      *
##       [4]    chr15   90716993-90717493      *
##       [5]    chr14   77616665-77617165      *
##       ...      ...                 ...    ...
##   [81915]    chr10 112475607-112476107      *
##   [81916]    chr10 109443822-109444322      *
##   [81917]    chr10 108630531-108631031      *
##   [81918]    chr10 106089771-106090271      *
##   [81919]    chr10 103131383-103131883      *
##   -------
##   seqinfo: 24 sequences from hg38 genome; no seqlengths
ranges(ATAC_granges_peaks_filtered)
## IRanges object with 81919 ranges and 0 metadata columns:
##               start       end     width
##           <integer> <integer> <integer>
##       [1]  55973648  55974148       501
##       [2]  50235720  50236220       501
##       [3]  15699328  15699828       501
##       [4]  90716993  90717493       501
##       [5]  77616665  77617165       501
##       ...       ...       ...       ...
##   [81915] 112475607 112476107       501
##   [81916] 109443822 109444322       501
##   [81917] 108630531 108631031       501
##   [81918] 106089771 106090271       501
##   [81919] 103131383 103131883       501
length(ATAC_granges)
## [1] 258265
length(ATAC_granges_peaks_filtered)
## [1] 81919
# Get number of windows for each chr
table(seqnames(ATAC_granges_peaks_filtered))
## 
## chr15  chr1 chr11  chr8  chr5 chr10 chr19 chr13  chr4  chr9  chr3 chr12 chr14 
##  3068  7478  3981  3766  4583  3973  2190  2186  4343  3269  5942  4056  2649 
##  chr6 chr17  chr2  chr7 chr16  chrX chr18 chr22 chr20 chr21  chrY 
##  4943  2945  6981  4212  2533  2612  1946  1291  2139   831     2
# Alternatively I can collpase overlapping ranges
GenomicRanges::reduce(ATAC_granges_peaks)
## GRanges object with 144900 ranges and 0 metadata columns:
##            seqnames            ranges strand
##               <Rle>         <IRanges>  <Rle>
##        [1]    chr15 19988001-19988501      *
##        [2]    chr15 20021287-20021787      *
##        [3]    chr15 20191686-20192186      *
##        [4]    chr15 20192954-20193710      *
##        [5]    chr15 20204491-20204991      *
##        ...      ...               ...    ...
##   [144896]    chr21 46637923-46638823      *
##   [144897]    chr21 46661197-46661697      *
##   [144898]    chr21 46667073-46668259      *
##   [144899]     chrY   4991691-4992191      *
##   [144900]     chrY 15355319-15355819      *
##   -------
##   seqinfo: 24 sequences from hg38 genome; no seqlengths

Extract the windows profile

# Report time execution
report_time_execution <- function(fun){
  start_time <- Sys.time()
  output <- fun
  end_time <- Sys.time()
  print(end_time - start_time)
return(output)
}

# Return score if the position in present, 0 otherwise
get_count_vector <- function(pos, df){
  if (pos %in% df$atac_relative_pos) {
  return(df$score[which(df$atac_relative_pos == pos)])
} else {return(0)}
}

# Return score vector for each position for both strands
get_count_vector_both_strands <- function(df){
  plus_count_vector <- sapply(1:501, get_count_vector, df = df[df$strand == "+",])
  minus_count_vector <- sapply(1:501, get_count_vector, df = df[df$strand == "-",])
  return(c(plus_count_vector, minus_count_vector))
}
  
# Return the CAGE profiles of the (CAGE) overlapping ATAC regions
get_chr_windows_profiles <- function(cage_granges, atac_granges, chr){
  # Select chromosome
  cage_granges <- cage_granges[seqnames(cage_granges) == chr]
  atac_granges <- atac_granges[seqnames(atac_granges) == chr]
  # Add all information into one df
  overlaps <- findOverlaps(cage_granges, atac_granges)                   # Index of overlapping CAGE fragment
  length(overlaps)  
  # Check if there are ATAC positive windows overlapping CAGE data
  if (length(overlaps) > 0){                                                   
    df <- cage_granges[queryHits(overlaps)]                              # Keep only overlapping CAGE data
    df$index_overlapping_atac <- subjectHits(overlaps)                   # Add index of overlapping ATAC   
    df %>% as_tibble() %>%                                               # Add ATAC start site and relative position
      mutate(atac_start = start(atac_granges[subjectHits(overlaps)]),
             atac_relative_pos = start - atac_start + 1) -> df
    # Extract profiles of each (CAGE) overlapping ATAC region
    profiles <- by(data = df, 
                   INDICES = df$index_overlapping_atac, 
                   FUN = function(x){get_count_vector_both_strands(x)})
    profiles <- data.frame(do.call(rbind, profiles))
    colnames(profiles) <- c(paste("Plus_", 1:501, sep = ""), paste("Minus_", 1:501, sep = ""))
    # Add metadata information
    profiles <- profiles %>% mutate(# atac_index = sort(unique(subjectHits(overlaps))),
                                    atac_start = start(atac_granges[as.numeric(rownames(profiles))]),
                                    chr = chr) %>% relocate(c(chr, atac_start), .before = Plus_1) 
    return(profiles)
  } 
  else {
    print(paste(chr, "contains no overlapping windows"))
    return(NULL)
  }
}

# Return the windows profiles for all chromosomes
get_windows_profiles <- function(cage_granges, atac_granges){
  chromosomes <- unique(seqnames(cage_granges))
  list_chr_profiles <- lapply(chromosomes, function(x) 
        {get_chr_windows_profiles(cage_granges = cage_granges, 
         atac_granges = atac_granges,
         chr = x)})
  output_list <- list()
  output_list$profiles <- data.frame(do.call(rbind, list_chr_profiles))  
  output_list$metadata <- output_list$profiles %>% select(chr, atac_start)
  output_list$profiles <- output_list$profiles %>% select(-chr, -atac_start)
  return(output_list)
}

pos_windows <- report_time_execution(get_windows_profiles(CAGE_granges, 
                                                          ATAC_granges_peaks_filtered))
## [1] "chrY contains no overlapping windows"
## Time difference of 2.192336 mins
rows = nrow(pos_windows$metadata)
rows
## [1] 14954
krows <- round(rows/1000)

Exploration of the obtained profiles

## Explore the resulting profiles

# Plot chr distribution
plot_chr_distribution <- function(window_metadata, 
                                  title = "Chr distribution"){
  window_metadata %>% group_by(chr) %>% count() %>% 
  ggplot(aes(x = factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep="")), 
             y = n)) +
  geom_bar(stat = "identity", col = "black", fill = brewer.pal(8,"Dark2")[1]) +
  labs(title = title, x = "Chromosome", y = "N. windows") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust=1,size = 12))
}

plot_chr_distribution(pos_windows$metadata, paste("Chr distribution (Positive windows ", krows, "k)", sep=""))

# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
get_cage_distribution_by_peak_position <- function(peaks_profile_df){
  apply(peaks_profile_df, 2, sum) %>% as_tibble() %>% 
  mutate(pos = c(-250:250, -250:250), strand = c(rep("+", 501), rep("-", 501))) %>% 
  rename(score = value) %>% relocate(score, .after = strand) %>%
  mutate(score = ifelse(strand == "-", -score, score))
}
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(pos_windows$profiles)

cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
 # facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
  labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",  
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
   scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()

# Plot distribution of ATAC-Seq peaks CAGE total coverage 
get_windows_total_cage_score_distribution <- function(peaks_profile_df){
  peaks_profile_df %>% 
    count(apply(peaks_profile_df, 1, sum)) %>% 
    rename(total_score = "apply(peaks_profile_df, 1, sum)", n_atac_peaks = n) %>%
    relocate(total_score, .after = n_atac_peaks) 
}

windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(pos_windows$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>% 
  ggplot(aes(x = total_score, y = n_atac_peaks)) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  labs(title = "Windows profiles total score distribution", 
       y = "N. Windows profiles",
       x = "Total CAGE score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>% 
  ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
  geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
  labs(title = "Windows profiles total score distribution", 
     x = NA,
     y = "Total CAGE score",
     size = "N. Windows profiles",
     col = "") +
  facet_zoom(ylim=c(0, 10000), shrink = FALSE) +
  scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme(axis.title.x=element_blank())

# Plot the maximum tss score of each window
max_tss_score_distribution <- rownames_to_column(pos_windows$profiles, var = "atac_start") %>% 
  mutate(max_tss_score = apply(pos_windows$profiles, 1, max)) %>% 
  select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()

max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) + 
  facet_zoom(ylim=c(0, 60), shrink = FALSE) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  coord_cartesian(xlim = c(0, 50)) +
  labs(title = "Max TSS score distribution", 
       y = "N. windows profiles",
       x = "Max TSS score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

Positive windows filtering and exploration

## Positive set filtering

# Try different filters and observe the resulting profiles
windows_profiles_filter <- function(windows,
                                    threshold = 1, fun = max){
  filter <- apply(windows$profiles, 1, fun) > threshold
    windows$profiles <-  windows$profile[filter,]
    windows$metadata <- windows$metadata[filter,]
  return(windows)
}

pos_windows_filtered_atleast_2_sum <- windows_profiles_filter(pos_windows, fun = sum)
pos_windows_filtered <- windows_profiles_filter(pos_windows)
pos_windows_filtered_atleast_3_max <- windows_profiles_filter(pos_windows, threshold = 2)

print(paste("Original windows number:", nrow(pos_windows$profiles)))
## [1] "Original windows number: 14954"
print(paste("N. windows after filtering (at least 2 reads in total):", nrow(pos_windows_filtered_atleast_2_sum$profiles)))
## [1] "N. windows after filtering (at least 2 reads in total): 9225"
print(paste("N. windows after filtering (at least a TSS with 2 reads):", nrow(pos_windows_filtered$profiles)))
## [1] "N. windows after filtering (at least a TSS with 2 reads): 6958"
print(paste("N. windows after filtering (at least a TSS with 3 reads):", nrow(pos_windows_filtered_atleast_3_max$profiles)))
## [1] "N. windows after filtering (at least a TSS with 3 reads): 3514"
rows = nrow(pos_windows_filtered$metadata)
rows
## [1] 6958
krows <- round(rows/1000)

## Plot after filtering

# Chr distribution
plot_chr_distribution(pos_windows_filtered$metadata, paste("Chr distribution (Positive windows ", krows, "k)", sep=""))

# Score distribution
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(pos_windows_filtered$profiles)

cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",  
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
   scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()

# Plot distribution of ATAC-Seq peaks CAGE total coverage 
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(pos_windows_filtered$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>% 
  ggplot(aes(x = total_score, y = n_atac_peaks)) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  labs(title = "Windows profiles total score distribution (All chr)", 
       y = "N. Windows profiles",
       x = "Total CAGE score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>% 
  ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
  geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
  labs(title = "Windows profiles total score distribution (All chr)", 
     x = NA,
     y = "Total CAGE score",
     size = "N. Windows profiles",
     col = "") +
  facet_zoom(ylim=c(0, 10000), shrink = FALSE) +
  scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme(axis.title.x=element_blank())

# Plot the maximum tss score of each window
max_tss_score_distribution <- pos_windows_filtered$profiles %>% 
  mutate(atac_start = pos_windows_filtered$metadata$atac_start) %>%         
  mutate(max_tss_score = apply(pos_windows_filtered$profiles, 1, max)) %>% 
  select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()

max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) + 
  facet_zoom(ylim=c(0, 60), shrink = FALSE) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  coord_cartesian(xlim = c(0, 50)) +
  labs(title = "Max TSS score distribution (All chr)", 
       y = "N. windows profiles",
       x = "Max TSS score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

Exploration of different chromosomes score distribution

# Get the distribution of CAGE score over one chr windows positions 
get_score_distribution_by_pos_one_chr <- function(list_windows, chr){
  windows_profile <- list_windows$profiles[list_windows$metadata == chr,]
  score_distribution <- get_cage_distribution_by_peak_position(windows_profile)
  score_distribution$chr = chr
  return(score_distribution)  
}

# Get the distribution of CAGE score for all chr
get_score_distribution_by_pos_all_chr <- function(list_windows){
  list_df <- lapply(unique(list_windows$metadata$chr), function(x){
    get_score_distribution_by_pos_one_chr(list_windows, as.character(x))}) 
  return(data.frame(do.call(rbind, list_df)))
}

windows_score_distribution_by_pos_all_chr <-   
  get_score_distribution_by_pos_all_chr(pos_windows_filtered)
  
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Positive set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),    
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  coord_cartesian(ylim = c(-1000, 1000)) +
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw()

Exploration of different chromosomes profiles

# Plot some profiles
plo_set_profiles <- function(windows,
                             chr,
                             details = FALSE,
                             title = "",
                             ylim = c(-50, 50), 
                             range = 1:56, sort = FALSE){
  windows_profile <- windows$profiles[windows$metadata$chr == chr,]
  windows_profile %>% 
    mutate(total_coverage = apply(windows_profile, 1, sum),
           max_coverage = apply(windows_profile, 1, max)) -> windows_profile
  # Add details (minimum TSS score and total score of the window)
  if (details){
    row_name <- paste("w_", 1:nrow(windows_profile), "\n(T=", 
                      as.character(total_coverage), ", \nM=", 
                      max_coverage, ")") 
  } else {row_name <- paste("w_", 1:nrow(windows_profile))}
  windows_profile %>%
    mutate(window = row_name) %>%
    relocate(c(window, total_coverage), .before = Plus_1) -> temp
  if (sort){
    temp %>% arrange(desc(total_coverage)) %>% slice(range) -> temp
  } 
  # Prepare for plotting
  temp <- temp %>% select(-total_coverage, -max_coverage) %>% slice(range) %>% melt()
  temp$value[grepl("^M", temp$variable)] <- -temp$value[grepl("^M", temp$variable)] 
  temp$strand[grepl("^M", temp$variable)] <- "-"
  temp$strand[grepl("^P", temp$variable)] <- "+"
  temp$variable <- as.numeric(gsub("\\D", "", temp$variable)) - 251
  # Plot
  temp %>% ggplot(aes(x = variable, y = value, color = strand)) + geom_line() + 
    facet_wrap(~window, ncol = 8, strip.position = "left") +
    coord_cartesian(ylim = ylim) +
    labs(title = paste("Windows profiles (", 
                       deparse(substitute(range)), ", ", chr, ")", sep = ""), 
         y = "TSS score",
         x = "Relative position to ATAC-Seq mid peak",
         fill = NA) +
    theme_bw() -> plot
  return(plot)
}

plo_set_profiles(pos_windows_filtered, chr = "chr1", sort=TRUE) 
## Using window as id variables

plo_set_profiles(pos_windows_filtered, chr = "chr6", sort=TRUE) 
## Using window as id variables

plo_set_profiles(pos_windows_filtered, chr = "chr11", sort=TRUE) 
## Using window as id variables

plo_set_profiles(pos_windows_filtered, chr = "chr2", sort=TRUE) 
## Using window as id variables

plo_set_profiles(pos_windows_filtered, chr = "chr3", sort=TRUE) 
## Using window as id variables

plo_set_profiles(pos_windows_filtered, chr = "chr4", sort=TRUE) 
## Using window as id variables

Extract the negative set

Export data and generate negative ranges by bedtools shuffle

# Export original atac data (before filtering them by overlaps) and positive set ranges
positive_set_granges <- GRanges(seqnames = pos_windows_filtered$metadata$chr,
                                ranges = IRanges(start = pos_windows_filtered$metadata$atac_start,
                                                 width = 501))

export(positive_set_granges, "../Data/pseudo_dreg_sets/positive_set_replicate1.bed", format = "bed")
export(ATAC_granges_peaks, "../Data/atac_positive_all_granges_501.bed", format = "bed")

# Duplicate 4 times the ATAC_peaks so to use as -i to generate more ranges with shuffle
ATAC_granges_duplicated <- GRanges(rbind(as.data.frame(ATAC_granges_peaks),
                                         as.data.frame(shift(ATAC_granges_peaks, 1)),
                                         as.data.frame(shift(ATAC_granges_peaks, 2)),
                                         as.data.frame(shift(ATAC_granges_peaks, -1)),
                                         as.data.frame(shift(ATAC_granges_peaks, -2))))
export(ATAC_granges_duplicated, "../Data/atac_positive_all_granges_501_duplicatedX4.bed", format = "bed")
# Generate negative set ranges (from pseudo_dreg directory) in such a way that:
# - They will be the same size, num and strand proportion as positive set ranges, blacklist and atac positive granges will not be included, no overlaps, try to include CAGE TSS
# - Risk: forcing the overlaps with CAGE I might risk to capture some true TSS signals that are in region where ATAC-Seq was not able to capture an actual open chromatine region
bedtools shuffle -i positive_set_replicate1.bed -g ../hg38.bed -incl ../ctss_files/timepoint_0/hg38.CAGE_AF7N_Pool_81_A2.filt.ctss.bed.gz -excl hg38-blacklist.v2.bed -excl ../atac_positive_all_granges_501.bed -noOverlapping > negative_set_replicate1_a1.bed

# - They will be the same size, num and strand proportion as atac positive granges, blacklist and atac positive granges will not be included, allow overlaps, don't force CAGE TSS
# - I will need to reiterate several times before I reach the desired number of profiles with at least a TSS
bedtools shuffle -i ../atac_positive_all_granges_501.bed -g ../hg38.bed -excl hg38-blacklist.v2.bed -excl ../atac_positive_all_granges_501.bed > negative_set_replicate1_b1.bed

# Do the same but without creating overlaps in the generated ranges
bedtools shuffle -i ../atac_positive_all_granges_501_duplicatedX4.bed -g ../hg38.bed -excl hg38-blacklist.v2.bed -excl ../atac_positive_all_granges_501.bed -noOverlapping > negative_set_replicate1_duplicateX4_noOverlaps.bed

Method A: I start and end 7k ranges that overlap atleast with 1 CAGE (forced by bedtools shuffle) Method B: Starting with 258k ranges I obtain 8k that overlap atleast with 1 CAGE (random shuffle ATAC negative regions) Starting with 650k, obtained 22k

Extraction and exploration of negative range profiles (A)

# Import negative set ranges and check for overlaps
negative_set_granges <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_a1.bed", format = "bed")
findOverlaps(negative_set_granges, positive_set_granges)
## Hits object with 0 hits and 0 metadata columns:
##    queryHits subjectHits
##    <integer>   <integer>
##   -------
##   queryLength: 6958 / subjectLength: 6958
# Extract CAGE profiles
neg_windows <- report_time_execution(get_windows_profiles(CAGE_granges, 
                                                          negative_set_granges))
## Time difference of 1.082768 mins
neg_windows$metadata
rows = nrow(neg_windows$metadata)
rows
## [1] 6958
krows <- round(rows/1000)

Exploration of the obtained profiles

## Explore the resulting profiles

# Chr distribution
plot_chr_distribution(neg_windows$metadata, paste("Chr distribution (Negative windows ", krows, "k)", sep=""))

# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(neg_windows$profiles)

cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
 # facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
  labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",  
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
   scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()

# Plot distribution of ATAC-Seq peaks CAGE total coverage 
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(neg_windows$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>% 
  ggplot(aes(x = total_score, y = n_atac_peaks)) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  labs(title = "Windows profiles total score distribution", 
       y = "N. Windows profiles",
       x = "Total CAGE score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>% 
  ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
  geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
  labs(title = "Windows profiles total score distribution", 
     x = NA,
     y = "Total CAGE score",
     size = "N. Windows profiles",
     col = "") +
  facet_zoom(ylim=c(0, 10000), shrink = FALSE) +
  scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme(axis.title.x=element_blank())

# Plot the maximum tss score of each window
max_tss_score_distribution <- neg_windows$profiles %>% 
  mutate(atac_start = neg_windows$metadata$atac_start) %>%         
  mutate(max_tss_score = apply(neg_windows$profiles, 1, max)) %>% 
  select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()

max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) + 
  #facet_zoom(ylim=c(0, 60), shrink = FALSE) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  coord_cartesian(xlim = c(0, 50)) +
  labs(title = "Max TSS score distribution", 
       y = "N. windows profiles",
       x = "Max TSS score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

# Plot different chromosomes score distribution
windows_score_distribution_by_pos_all_chr <-   
  get_score_distribution_by_pos_all_chr(neg_windows)
  
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  coord_cartesian(ylim = c(-1000, 1000)) +
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw()

# Plot some profiles
plo_set_profiles(neg_windows, chr = "chr1", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr6", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr11", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr2", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr3", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr4", sort=TRUE) 
## Using window as id variables

Extraction and exploration of negative range profiles (B, shuffling 218k ranges)

# Import negative set ranges and check for overlaps
negative_set_granges <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b1.bed", format = "bed")
findOverlaps(negative_set_granges, positive_set_granges)
## Hits object with 0 hits and 0 metadata columns:
##    queryHits subjectHits
##    <integer>   <integer>
##   -------
##   queryLength: 258265 / subjectLength: 6958
sum(countOverlaps(negative_set_granges) > 1)
## [1] 21357
# Extract CAGE profiles
neg_windows <- report_time_execution(get_windows_profiles(CAGE_granges, 
                                                          negative_set_granges))
## Time difference of 1.46191 mins
rows = nrow(neg_windows$metadata)
rows
## [1] 8123
krows <- round(rows/1000)

Exploration of the obtained profiles

## Explore the resulting profiles

# Chr distribution
plot_chr_distribution(neg_windows$metadata, paste("Chr distribution (Negative windows, ", krows, "k)", sep=""))

# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(neg_windows$profiles)

cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
 # facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
  labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",  
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
   scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()

# Plot distribution of ATAC-Seq peaks CAGE total coverage 
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(neg_windows$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>% 
  ggplot(aes(x = total_score, y = n_atac_peaks)) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  labs(title = "Windows profiles total score distribution", 
       y = "N. Windows profiles",
       x = "Total CAGE score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>% 
  ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
  geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
  labs(title = "Windows profiles total score distribution", 
     x = NA,
     y = "Total CAGE score",
     size = "N. Windows profiles",
     col = "") +
  facet_zoom(ylim=c(0, 10000), shrink = FALSE) +
  scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme(axis.title.x=element_blank())

# Plot the maximum tss score of each window
max_tss_score_distribution <- neg_windows$profiles %>% 
  mutate(atac_start = neg_windows$metadata$atac_start) %>%         
  mutate(max_tss_score = apply(neg_windows$profiles, 1, max)) %>% 
  select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()

max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) + 
  #facet_zoom(ylim=c(0, 60), shrink = FALSE) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  coord_cartesian(xlim = c(0, 50)) +
  labs(title = "Max TSS score distribution", 
       y = "N. windows profiles",
       x = "Max TSS score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

# Plot different chromosomes score distribution
windows_score_distribution_by_pos_all_chr <-   
  get_score_distribution_by_pos_all_chr(neg_windows)
  
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  coord_cartesian(ylim = c(-1000, 1000)) +
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw()

# Plot some profiles
plo_set_profiles(neg_windows, chr = "chr1", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr6", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr11", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr2", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr3", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr4", sort=TRUE)  
## Using window as id variables

Extraction and exploration of negative range profiles (B, shuffling 220k ranges than use union) (WRONG)

# Import negative set ranges and check for overlaps
#negative_set_granges_1 <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b1.bed", format = "bed")
#negative_set_granges_2 <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b2.bed", format = "bed")
#negative_set_granges_3 <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b3.bed", format = "bed")
#negative_set_granges_4 <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b4.bed", format = "bed")
#negative_set_granges_5 <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b5.bed", format = "bed")

#negative_set_granges <- GenomicRanges::union(negative_set_granges_1, negative_set_granges_2)
#negative_set_granges <- GenomicRanges::union(negative_set_granges, negative_set_granges_3)
#negative_set_granges <- GenomicRanges::union(negative_set_granges, negative_set_granges_4)
#negative_set_granges <- GenomicRanges::union(negative_set_granges, negative_set_granges_5)

Extraction and exploration of negative range profiles (B2, shuffling 1200k ranges without overlaps) (Best)

# Import negative set ranges and check for overlaps
negative_set_granges <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_duplicateX4_noOverlaps.bed", format = "bed")

# Check for overlaps with ATAC positive regions and between granges in the same file
findOverlaps(negative_set_granges, positive_set_granges)
## Hits object with 0 hits and 0 metadata columns:
##    queryHits subjectHits
##    <integer>   <integer>
##   -------
##   queryLength: 1291325 / subjectLength: 6958
paste("Ranges before extraction:", length(negative_set_granges))
## [1] "Ranges before extraction: 1291325"
paste("Within overlaps:", sum(countOverlaps(negative_set_granges) > 1))
## [1] "Within overlaps: 0"
# Extract CAGE profiles
neg_windows <- report_time_execution(get_windows_profiles(CAGE_granges, 
                                                          negative_set_granges))
## Time difference of 5.977751 mins
# Check for profiles without overlapping CAGE data
non_overlapping_ranges <- apply(neg_windows$profiles, 1, sum) == 0
if (sum(non_overlapping_ranges) > 0){
  print(paste("Removing", sum(non_overlapping_ranges), "non-overlapping ranges"))
  neg_windows$profiles <- neg_windows$profiles[!non_overlapping_ranges,]
  neg_windows$metadata <- neg_windows$metadata[!non_overlapping_ranges,]
}

rows = nrow(neg_windows$metadata)
paste("Extracted profiles:", rows)
## [1] "Extracted profiles: 40847"
krows <- round(rows/1000)

Exploration of the obtained profiles

## Explore the resulting profiles

# Chr distribution
plot_chr_distribution(neg_windows$metadata, paste("Chr distribution (Negative windows ", krows, "k)", sep=""))

# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(neg_windows$profiles)

cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
 # facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
  labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",  
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
   scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()

# Plot distribution of ATAC-Seq peaks CAGE total coverage 
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(neg_windows$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>% 
  ggplot(aes(x = total_score, y = n_atac_peaks)) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  labs(title = "Windows profiles total score distribution", 
       y = "N. Windows profiles",
       x = "Total CAGE score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>% 
  ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
  geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
  labs(title = "Windows profiles total score distribution", 
     x = NA,
     y = "Total CAGE score",
     size = "N. Windows profiles",
     col = "") +
  facet_zoom(ylim=c(0, 1000), shrink = FALSE) +
  scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme(axis.title.x=element_blank())

# Plot the maximum tss score of each window
max_tss_score_distribution <- neg_windows$profiles %>% 
  mutate(atac_start = neg_windows$metadata$atac_start) %>%         
  mutate(max_tss_score = apply(neg_windows$profiles, 1, max)) %>% 
  select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()

max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) + 
  #facet_zoom(ylim=c(0, 60), shrink = FALSE) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  coord_cartesian(xlim = c(0, 50)) +
  labs(title = "Max TSS score distribution", 
       y = "N. windows profiles",
       x = "Max TSS score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

# Plot different chromosomes score distribution
windows_score_distribution_by_pos_all_chr <-   
  get_score_distribution_by_pos_all_chr(neg_windows)
  
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  coord_cartesian(ylim = c(-1000, 1000)) +
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw()

# Plot some profiles
plo_set_profiles(neg_windows, chr = "chr1", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr6", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr11", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr2", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr3", sort=TRUE) 
## Using window as id variables

plo_set_profiles(neg_windows, chr = "chr4", sort=TRUE) 
## Using window as id variables

Export negative and positive set profiles and metadata

# Sample from negative set
windows_sampling <- function(windows, size){
  uniform_sampling <- runif(size, 1, nrow(windows$metadata))
  windows$metadata <- windows$metadata[uniform_sampling,]
  windows$profiles <- windows$profiles[uniform_sampling,]
  return(windows)
}

# Plot chr distribution
plot_chr_distribution(pos_windows_filtered$metadata, paste("Chr distribution (Positive windows, ",
                      round(nrow(pos_windows_filtered$metadata)/1000), "k)", sep = ""))

plot_chr_distribution(neg_windows$metadata, paste("Chr distribution (Negative windows, ",
                      round(nrow(neg_windows$metadata)/1000), "k)", sep = ""))

neg_windows_sampled <- windows_sampling(neg_windows, size=nrow(pos_windows_filtered$metadata))
plot_chr_distribution(neg_windows_sampled$metadata, paste("Chr distribution (Negative windows sampled, ",
                      round(nrow(neg_windows_sampled$metadata)/1000), "k)", sep = ""))

## Export data for ML feeding

# Merge positive and negative examples
windows_profile <- rbind(mutate(pos_windows_filtered$profiles, label = 1), 
                         mutate(neg_windows_sampled$profiles, label = 0))
windows_metadata <- rbind(mutate(pos_windows_filtered$metadata, label = 1), 
                          mutate(neg_windows_sampled$metadata, label = 0))

# Shuffle the data
index <- sample(nrow(windows_profile))
windows_profile <- windows_profile[index,]
windows_metadata <- windows_metadata[index,]

paste("Size windows profile", nrow(windows_profile))
## [1] "Size windows profile 13916"
paste("Size positive windows profile (filtered)", nrow(pos_windows_filtered$profile))
## [1] "Size positive windows profile (filtered) 6958"
paste("Size negative windows profile (sampled)", nrow(neg_windows_sampled$profile))
## [1] "Size negative windows profile (sampled) 6958"
# Divide train and test by chr
test_index <- windows_metadata$chr %in% c("chr2", "chr3", "chr4")
windows_profile_test <- windows_profile[test_index,]
windows_metadata_test <- windows_metadata[test_index,]
windows_profile_train <- windows_profile[!test_index,]
windows_metadata_train <- windows_metadata[!test_index,]

# Export
write_csv(windows_profile, "../Data/ML_input/profiles_replicate1_b2.csv")
write_csv(windows_metadata, "../Data/ML_input/metadata_replicate1_b2.csv")
write_csv(windows_profile_test, "../Data/ML_input/profiles_replicate1_b2_test.csv")
write_csv(windows_metadata_test, "../Data/ML_input/metadata_replicate1_b2_test.csv")
write_csv(windows_profile_train, "../Data/ML_input/profiles_replicate1_b2_train.csv")
write_csv(windows_metadata_train, "../Data/ML_input/metadata_replicate1_b2_train.csv")